Scraped reviews and information for 5,953 anime from MyAnimeList.net. Outline: 1. web scraping; 2. reviews: exploratory data analysis (EDA); 3. anime info: network-graph EDA; 4. predicting the anime score.
library(ggplot2)
library(dplyr)
# Load the scraped review table and print a per-column summary of it.
AnimeReviews <- readRDS(file = file.path("AnimeReviews", "AnimeReviews.RDS"))
summary(AnimeReviews)
## anime_title rating anime_url review_text
## Length:89244 Min. : 0.000 Length:89244 Length:89244
## Class :character 1st Qu.: 6.000 Class :character Class :character
## Mode :character Median : 8.000 Mode :character Mode :character
## Mean : 7.545
## 3rd Qu.: 9.000
## Max. :10.000
## review_time reviewer pic_url
## Length:89244 Length:89244 Length:89244
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## date
## Min. :2006-11-07 00:00:00
## 1st Qu.:2012-01-06 00:00:00
## Median :2014-04-01 00:00:00
## Mean :2013-08-25 01:41:54
## 3rd Qu.:2015-10-04 00:00:00
## Max. :2016-11-07 00:00:00
#head(AnimeReviews)
# Number of reviews per anime title, most-reviewed titles first.
summ_review <- AnimeReviews %>%
  group_by(anime_title) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count))
# Distribution of the per-title review counts.
summary(summ_review[["Count"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 3.00 15.02 11.00 1112.00
# The three most-reviewed titles.
head(summ_review, n = 3)
## # A tibble: 3 × 2
## anime_title Count
## <chr> <int>
## 1 Sword Art Online 1112
## 2 Angel Beats! 731
## 3 Death Note 656
# Frequency polygon of review counts over the review date, using 60 bins.
p1 <- ggplot(AnimeReviews, aes(x = date)) +
  geom_freqpoly(bins = 60) +
  labs(x = "Review Date", y = "Number of Reviews") +
  theme_bw()
p1
# Review counts over time for the single title "Sword Art Online".
# %in% is used instead of == for the row filter: == returns NA for a missing
# anime_title, and an NA in a logical row index adds an all-NA row to the
# result, whereas %in% always returns TRUE or FALSE.
review1 <- AnimeReviews[AnimeReviews$anime_title %in% "Sword Art Online", ]
p2 <- ggplot(review1, aes(x = date)) +
  geom_freqpoly(bins = 30) +
  xlab("Review Date") +
  ylab("Number of Reviews") +
  theme_bw() +
  ggtitle("Reviews of 'Sword Art Online'")
p2
library(visNetwork)
library(car)
# Load the scraped per-anime records (a list with one element per anime).
AnimeInfo <- readRDS(file = file.path("AnimeInfo", "AnimeInfo.RDS"))
# Number of anime records.
length(AnimeInfo)
## [1] 5953
# Field names carried by the first record.
names(AnimeInfo[[1]])
## [1] "anime_title" "anime_status" "anime_popularity"
## [4] "anime_studios" "anime_genres" "anime_favorites"
## [7] "anime_synopsis" "anime_aired" "anime_premiered"
## [10] "anime_related" "anime_mainactors" "anime_producers"
## [13] "anime_score" "anime_ranked" "anime_staff"
## [16] "anime_rating" "anime_background" "anime_episodes"
## [19] "anime_type" "anime_members"
# Fix the RNG seed so the same subset is drawn on every run, then pick
# 100 anime at random (without replacement) for the network graph.
set.seed(0)
graph_list <- sample(AnimeInfo, size = 100, replace = FALSE)
# Build the edge table for the anime network graph.
#
# Two anime are linked when they share at least one main cast member or at
# least one staff member.
#
# Args:
#   lst: list of anime records. Each record is a list that may carry
#        `anime_mainactors` and `anime_staff`; both are flattened with
#        unlist() before comparison.
#
# Returns:
#   A data.frame with one row per linked pair (from < to):
#     from, to - positions of the two anime in `lst`
#     value    - number of shared staff names plus shared cast names
#     title    - HTML tooltip listing shared staff first, then shared cast,
#                entries separated by "<br>"
#   A zero-row data.frame with the same columns when no pair shares anyone
#   (including when `lst` has fewer than two records).
mklink_lst <- function(lst) {
  n <- length(lst)
  actor_lst <- lapply(lst, function(x) unlist(x$anime_mainactors))
  staff_lst <- lapply(lst, function(x) unlist(x$anime_staff))

  # Preallocate for the worst case (every pair linked) instead of growing a
  # data.frame with rbind() on every hit.
  max_pairs <- n * (n - 1) / 2
  from <- numeric(max_pairs)
  to <- numeric(max_pairs)
  value <- numeric(max_pairs)
  title <- character(max_pairs)
  k <- 0

  # Visit exactly the pairs 1 <= i < j <= n. The earlier bounds `1:l-1` and
  # `i+1:l` parse as `(1:l) - 1` and `i + (1:l)`, so they started at index 0
  # and ran past the end of the list.
  for (i in seq_len(max(n - 1, 0))) {
    for (j in seq(i + 1, n)) {
      shared_staff <- intersect(staff_lst[[i]], staff_lst[[j]])
      shared_actor <- intersect(actor_lst[[i]], actor_lst[[j]])
      n_shared <- length(shared_staff) + length(shared_actor)
      if (n_shared == 0) {
        next
      }

      # Staff lines come before cast lines; an absent part contributes nothing.
      tooltip <- c(
        if (length(shared_staff) > 0) {
          paste0("Staff: ", shared_staff, collapse = "<br>")
        },
        if (length(shared_actor) > 0) {
          paste0("Cast: ", shared_actor, collapse = "<br>")
        }
      )

      k <- k + 1
      from[k] <- i
      to[k] <- j
      value[k] <- n_shared
      title[k] <- paste(tooltip, collapse = "<br>")
    }
  }

  keep <- seq_len(k)
  data.frame(from = from[keep], to = to[keep],
             value = value[keep], title = title[keep])
}
# Build the node table for the anime network graph.
#
# Args:
#   lst: list of anime records. Each record is a list that may carry
#        `anime_title`, `anime_type` and `anime_members` (a count, possibly
#        written with thousands separators such as "12,345").
#
# Returns:
#   A data.frame with one row per record and the columns
#     id    - position of the record in `lst`
#     title - anime title
#     group - anime type
#     size  - 2 * fourth root of the member count
#     label - always NA
#   A missing field yields NA in that row rather than a column-length
#   mismatch, and an empty `lst` yields a zero-row data.frame.
mknode_lst <- function(lst) {
  # First value of `field` for every record, NA when the field is absent.
  first_field <- function(field) {
    vapply(lst, function(x) {
      v <- unlist(x[[field]])
      if (length(v) == 0) NA_character_ else as.character(v[[1]])
    }, character(1), USE.NAMES = FALSE)
  }

  # Strip thousands separators before converting the member count.
  members <- as.numeric(
    gsub(",", "", first_field("anime_members"), fixed = TRUE)
  )

  data.frame(
    id = seq_along(lst),
    title = first_field("anime_title"),
    group = first_field("anime_type"),
    # Fourth root compresses the wide range of member counts.
    size = 2 * sqrt(sqrt(members)),
    label = rep(NA, length(lst)),
    stringsAsFactors = FALSE
  )
}
# Edge and node tables for the sampled anime.
link_list <- mklink_lst(graph_list)
node_list <- mknode_lst(graph_list)

# Assemble the interactive network step by step: base graph, node options,
# a legend on the left driven by the node groups, and a group selector with
# nearest-neighbour highlighting.
graph <- visNetwork(nodes = node_list, edges = link_list, main = "Anime")
graph <- visNodes(graph, label = NULL)
graph <- visLegend(graph, enabled = TRUE, useGroups = TRUE, addNodes = NULL,
                   addEdges = NULL, width = 0.2, position = "left",
                   main = NULL)
graph <- visOptions(graph, selectedBy = "group", highlightNearest = TRUE)
graph
# Flatten the scraped anime records into one modelling table with one row per
# anime. Every column is built with vapply(), so each record contributes
# exactly one value: a missing field becomes NA instead of being dropped
# (which would make the columns differ in length), and when a field holds
# several values only the first is kept.

# First value of a field as character, or NA when the field is absent/empty.
first_or_na_chr <- function(value) {
  value <- unlist(value)
  if (length(value) == 0) NA_character_ else as.character(value[[1]])
}

# Count field such as "12,345" -> 12345 (NA when absent).
parse_count <- function(value) {
  as.numeric(gsub(",", "", first_or_na_chr(value), fixed = TRUE))
}

# k-th main cast member of every anime, NA when fewer than k are listed.
# The field is spelled out in full (`anime_mainactors`) rather than relying on
# partial matching of `$`.
nth_actor <- function(k) {
  vapply(AnimeInfo, function(x) {
    actors <- unlist(x$anime_mainactors)
    if (length(actors) < k) NA_character_ else as.character(actors[[k]])
  }, character(1))
}

Title <- vapply(AnimeInfo, function(x) first_or_na_chr(x$anime_title),
                character(1))
Type <- vapply(AnimeInfo, function(x) first_or_na_chr(x$anime_type),
               character(1))
Rating <- vapply(AnimeInfo, function(x) first_or_na_chr(x$anime_rating),
                 character(1))
Viewer <- vapply(AnimeInfo, function(x) parse_count(x$anime_members),
                 numeric(1))
Score <- vapply(AnimeInfo,
                function(x) as.numeric(first_or_na_chr(x$anime_score)),
                numeric(1))
Favorite <- vapply(AnimeInfo, function(x) parse_count(x$anime_favorites),
                   numeric(1))

Actor1 <- nth_actor(1)
Actor2 <- nth_actor(2)
Actor3 <- nth_actor(3)
Actor4 <- nth_actor(4)

Director <- vapply(AnimeInfo,
                   function(x) first_or_na_chr(x$anime_staff$Director),
                   character(1))
Musician <- vapply(AnimeInfo,
                   function(x) first_or_na_chr(x$anime_staff$Music),
                   character(1))

AnimeInfo_df <- data.frame(Title = Title, Type = Type, Rating = Rating,
                           Director = Director, Actor1 = Actor1,
                           Actor2 = Actor2, Actor3 = Actor3,
                           Actor4 = Actor4, Musician = Musician,
                           Score = Score,
                           Viewer = Viewer, Favorite = Favorite
)
table(AnimeInfo_df$Rating)
##
## G - All Ages None
## 686 31
## PG - Children PG-13 - Teens 13 or older
## 351 2815
## R - 17+ (violence & profanity) R+ - Mild Nudity
## 722 628
## Rx - Hentai
## 720
# Upon investigation, every anime whose rating is "None" is an old title
# produced before the rating system existed, and they all look like
# "G"-rated anime, so they are recoded as "G - All Ages". The column is then
# stored as a factor whose levels no longer include "None".
AnimeInfo_df$Rating <- as.factor(
  ifelse(as.character(AnimeInfo_df$Rating) == "None",
         "G - All Ages",
         as.character(AnimeInfo_df$Rating))
)
# Turn the director into a dummy variable: TRUE when the director appears in
# the list of anime directors who won an award at the Tokyo Anime Award or
# Animation Kobe in the last 15-20 years. Names repeat in the raw list (one
# entry per award), hence the unique().
direct_award <- unique(c(
  "Daichi, Akitaro", "Miyazaki, Hayao", "Hara, Keiichi", "Kon, Satoshi",
  "Miyazaki, Hayao", "Tomino, Yoshiyuki", "Hosoda, Mamoru", "Anno, Hideaki",
  "Miyazaki, Hayao", "Hosoda, Mamoru", "Yonebayashi, Hiromasa", "Shinbo, Akiyuki",
  "Hosoda, Mamoru", "Araki, Tetsuro", "Takahata, Isao", "Fujita, Yōichi",
  "Anno, Hideaki", "Miyazaki, Hayao", "Watanabe, Shinichi", "Daichi, Akitaro",
  "Okiura, Hiroyuki", "Kitakubo, Hiroyuki", "Hara, Keiichi", "Kuroda, Yosuke",
  "Kamiyama, Kenji", "Yoshida, Kenichi", "Nagahama, Hiroshi", "Imaishi, Hiroyuki",
  "Iso, Mitsuo", "Kato, Kunio", "Hosoda, Mamoru", "Okada, Mari",
  "Agematsu, Noriyasu", "Mizushima, Tsutomu", "Kishi, Seiji", "Mizushima, Seiji"
))
# %in% already yields TRUE/FALSE (never NA), so it is used directly.
AnimeInfo_df$Award_Director <- AnimeInfo_df$Director %in% direct_award
## didn't have enough time to do the same thing on actors and musicians
## Multi-variable linear regression of Score on the director-award flag,
## viewer count, favorite count, anime type and rating. glm() is called
## without a `family`, so the gaussian family is used (see the echoed
## "Dispersion parameter for gaussian family" line in the summary).
model <- glm(Score ~ Award_Director + Viewer + Favorite + Type + Rating, data = AnimeInfo_df)
summary(model)
##
## Call:
## glm(formula = Score ~ Award_Director + Viewer + Favorite + Type +
## Rating, data = AnimeInfo_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7681 -0.3781 0.0714 0.4819 3.2648
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 6.735e+00 3.486e-02 193.173
## Award_DirectorTRUE 3.733e-01 5.212e-02 7.163
## Viewer 5.395e-06 2.327e-07 23.182
## Favorite -3.646e-05 5.283e-06 -6.901
## TypeMusic -5.566e-01 8.036e-02 -6.926
## TypeONA -7.764e-01 5.816e-02 -13.349
## TypeOVA -2.281e-01 3.493e-02 -6.530
## TypeSpecial 5.017e-03 3.966e-02 0.127
## TypeTV 1.777e-02 3.040e-02 0.584
## RatingPG - Children -8.318e-03 4.906e-02 -0.170
## RatingPG-13 - Teens 13 or older 2.321e-01 3.262e-02 7.115
## RatingR - 17+ (violence & profanity) 1.474e-01 4.143e-02 3.559
## RatingR+ - Mild Nudity -1.865e-01 4.274e-02 -4.364
## RatingRx - Hentai -1.337e-01 4.632e-02 -2.886
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## Award_DirectorTRUE 8.82e-13 ***
## Viewer < 2e-16 ***
## Favorite 5.72e-12 ***
## TypeMusic 4.80e-12 ***
## TypeONA < 2e-16 ***
## TypeOVA 7.12e-11 ***
## TypeSpecial 0.899335
## TypeTV 0.558947
## RatingPG - Children 0.865357
## RatingPG-13 - Teens 13 or older 1.25e-12 ***
## RatingR - 17+ (violence & profanity) 0.000375 ***
## RatingR+ - Mild Nudity 1.30e-05 ***
## RatingRx - Hentai 0.003916 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.5564911)
##
## Null deviance: 4651.4 on 5952 degrees of freedom
## Residual deviance: 3305.0 on 5939 degrees of freedom
## AIC: 13421
##
## Number of Fisher Scoring iterations: 2
# Diagnostics for the full model: the default plot() output for the fit,
# added-variable plots (car::avPlots), and the (generalized) variance
# inflation factors (car::vif) to check for collinearity among predictors.
plot(model)
avPlots(model)
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## Award_Director 1.025117 1 1.012481
## Viewer 3.089501 1 1.757698
## Favorite 2.654953 1 1.629403
## Type 1.853266 5 1.063638
## Rating 1.751659 5 1.057657
# Reduced model: the same regression as `model` but without the Viewer term,
# so that the two fits can be compared as nested models.
model_2 <- glm(Score ~ Award_Director + Favorite + Type + Rating, data = AnimeInfo_df)
summary(model_2)
##
## Call:
## glm(formula = Score ~ Award_Director + Favorite + Type + Rating,
## data = AnimeInfo_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7822 -0.4149 0.0797 0.5036 3.2468
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 6.753e+00 3.640e-02 185.553
## Award_DirectorTRUE 4.748e-01 5.423e-02 8.755
## Favorite 5.871e-05 3.473e-06 16.904
## TypeMusic -6.097e-01 8.388e-02 -7.269
## TypeONA -8.341e-01 6.067e-02 -13.747
## TypeOVA -3.166e-01 3.625e-02 -8.732
## TypeSpecial -4.088e-02 4.136e-02 -0.988
## TypeTV 1.211e-01 3.140e-02 3.858
## RatingPG - Children -6.464e-03 5.122e-02 -0.126
## RatingPG-13 - Teens 13 or older 3.537e-01 3.362e-02 10.519
## RatingR - 17+ (violence & profanity) 3.558e-01 4.223e-02 8.425
## RatingR+ - Mild Nudity -2.208e-02 4.400e-02 -0.502
## RatingRx - Hentai -4.841e-02 4.821e-02 -1.004
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## Award_DirectorTRUE < 2e-16 ***
## Favorite < 2e-16 ***
## TypeMusic 4.1e-13 ***
## TypeONA < 2e-16 ***
## TypeOVA < 2e-16 ***
## TypeSpecial 0.322989
## TypeTV 0.000116 ***
## RatingPG - Children 0.899588
## RatingPG-13 - Teens 13 or older < 2e-16 ***
## RatingR - 17+ (violence & profanity) < 2e-16 ***
## RatingR+ - Mild Nudity 0.615886
## RatingRx - Hentai 0.315376
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.6067446)
##
## Null deviance: 4651.4 on 5952 degrees of freedom
## Residual deviance: 3604.1 on 5940 degrees of freedom
## AIC: 13934
##
## Number of Fisher Scoring iterations: 2
# Same diagnostics for the reduced model: default plot() output for the fit,
# added-variable plots (car::avPlots) and variance inflation factors
# (car::vif).
plot(model_2)
avPlots(model_2)
vif(model_2)
## GVIF Df GVIF^(1/(2*Df))
## Award_Director 1.017893 1 1.008907
## Favorite 1.052151 1 1.025744
## Type 1.672155 5 1.052756
## Rating 1.634392 5 1.050354
# Nested-model comparison: does adding Viewer (model vs. model_2) lower the
# residual deviance by more than chance would?
model_dev <- model$deviance
model_df <- model$df.residual
model_2_dev <- model_2$deviance
model_2_df <- model_2$df.residual
# Both fits use the gaussian family, whose dispersion is estimated rather than
# fixed at 1. The drop in deviance therefore has to be divided by the
# dispersion estimate of the larger model before it is referred to a
# chi-square distribution; this is the same scaling that
# anova(..., test = "Chisq") applies below.
model_dispersion <- summary(model)$dispersion
pchisq((model_2_dev - model_dev) / model_dispersion,
       model_2_df - model_df, lower.tail = FALSE)
# NOTE: the value previously echoed here ([1] 5.273187e-67) came from the
# unscaled deviance difference; re-run to refresh the output.
anova(model_2, model, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: Score ~ Award_Director + Favorite + Type + Rating
## Model 2: Score ~ Award_Director + Viewer + Favorite + Type + Rating
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 5940 3604.1
## 2 5939 3305.0 1 299.06 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Share of the null deviance removed by the full model, i.e. one minus the
# ratio of residual deviance to null deviance.
R2 <- 1 - model[["deviance"]] / model[["null.deviance"]]
R2
## [1] 0.2894562
The analysis-of-deviance (chi-square) test comparing the two nested models yields a p-value of < 2.2e-16, so the variable “Viewer” does add information to the model. Significant coefficients: (Intercept), Award_DirectorTRUE, Favorite, Viewer, TypeMusic, TypeONA, TypeOVA, RatingPG-13 - Teens 13 or older, RatingR - 17+ (violence & profanity), RatingR+ - Mild Nudity, RatingRx - Hentai.
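Adjusted R-squared: 0.2236, i.e. the model explains only 22.36% of the variance in anime scores. (This figure matches the reduced model without “Viewer”; the R-squared computed above for the full model is 0.2895, so even the full model explains less than 30% of the variance.)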